import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
import plotly.express as px
import pycountry
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pip install pycountry
Requirement already satisfied: pycountry in /Users/harshitha/anaconda3/lib/python3.11/site-packages (23.12.11) Note: you may need to restart the kernel to use updated packages.
df = pd.read_csv("/Users/harshitha/Desktop/Datasets/Supply_Chain_Dataset.csv", encoding='latin-1')
df.head()
| Type | Days for shipping (real) | Days for shipment (scheduled) | Benefit per order | Sales per customer | Delivery Status | Late_delivery_risk | Category Id | Category Name | Customer City | ... | Order Zipcode | Product Card Id | Product Category Id | Product Description | Product Image | Product Name | Product Price | Product Status | shipping date (DateOrders) | Shipping Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | 91.250000 | 314.640015 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 2/3/2018 22:56 | Standard Class |
| 1 | TRANSFER | 5 | 4 | -249.089996 | 311.359985 | Late delivery | 1 | 73 | Sporting Goods | Caguas | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/18/2018 12:27 | Standard Class |
| 2 | CASH | 4 | 4 | -247.779999 | 309.720001 | Shipping on time | 0 | 73 | Sporting Goods | San Jose | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/17/2018 12:06 | Standard Class |
| 3 | DEBIT | 3 | 4 | 22.860001 | 304.809998 | Advance shipping | 0 | 73 | Sporting Goods | Los Angeles | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/16/2018 11:45 | Standard Class |
| 4 | PAYMENT | 2 | 4 | 134.210007 | 298.250000 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | ... | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/15/2018 11:24 | Standard Class |
5 rows × 53 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 180519 entries, 0 to 180518 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Type 180519 non-null object 1 Days for shipping (real) 180519 non-null int64 2 Days for shipment (scheduled) 180519 non-null int64 3 Benefit per order 180519 non-null float64 4 Sales per customer 180519 non-null float64 5 Delivery Status 180519 non-null object 6 Late_delivery_risk 180519 non-null int64 7 Category Id 180519 non-null int64 8 Category Name 180519 non-null object 9 Customer City 180519 non-null object 10 Customer Country 180519 non-null object 11 Customer Email 180519 non-null object 12 Customer Fname 180519 non-null object 13 Customer Id 180519 non-null int64 14 Customer Lname 180511 non-null object 15 Customer Password 180519 non-null object 16 Customer Segment 180519 non-null object 17 Customer State 180519 non-null object 18 Customer Street 180519 non-null object 19 Customer Zipcode 180516 non-null float64 20 Department Id 180519 non-null int64 21 Department Name 180519 non-null object 22 Latitude 180519 non-null float64 23 Longitude 180519 non-null float64 24 Market 180519 non-null object 25 Order City 180519 non-null object 26 Order Country 180519 non-null object 27 Order Customer Id 180519 non-null int64 28 order date (DateOrders) 180519 non-null object 29 Order Id 180519 non-null int64 30 Order Item Cardprod Id 180519 non-null int64 31 Order Item Discount 180519 non-null float64 32 Order Item Discount Rate 180519 non-null float64 33 Order Item Id 180519 non-null int64 34 Order Item Product Price 180519 non-null float64 35 Order Item Profit Ratio 180519 non-null float64 36 Order Item Quantity 180519 non-null int64 37 Sales 180519 non-null float64 38 Order Item Total 180519 non-null float64 39 Order Profit Per Order 180519 non-null float64 40 Order Region 180519 non-null object 41 Order State 180519 non-null object 42 Order Status 180519 non-null object 43 Order Zipcode 24840 non-null float64 44 Product Card Id 180519 non-null int64 45 Product Category Id 180519 non-null int64 46 Product Description 0 non-null float64 47 Product Image 180519 non-null object 48 Product Name 180519 non-null object 49 Product Price 180519 non-null float64 50 Product Status 180519 non-null int64 51 shipping date (DateOrders) 180519 non-null object 52 Shipping Mode 180519 non-null object dtypes: float64(15), int64(14), object(24) memory usage: 73.0+ MB
df.isnull().sum()
Type 0 Days for shipping (real) 0 Days for shipment (scheduled) 0 Benefit per order 0 Sales per customer 0 Delivery Status 0 Late_delivery_risk 0 Category Id 0 Category Name 0 Customer City 0 Customer Country 0 Customer Email 0 Customer Fname 0 Customer Id 0 Customer Lname 8 Customer Password 0 Customer Segment 0 Customer State 0 Customer Street 0 Customer Zipcode 3 Department Id 0 Department Name 0 Latitude 0 Longitude 0 Market 0 Order City 0 Order Country 0 Order Customer Id 0 order date (DateOrders) 0 Order Id 0 Order Item Cardprod Id 0 Order Item Discount 0 Order Item Discount Rate 0 Order Item Id 0 Order Item Product Price 0 Order Item Profit Ratio 0 Order Item Quantity 0 Sales 0 Order Item Total 0 Order Profit Per Order 0 Order Region 0 Order State 0 Order Status 0 Order Zipcode 155679 Product Card Id 0 Product Category Id 0 Product Description 180519 Product Image 0 Product Name 0 Product Price 0 Product Status 0 shipping date (DateOrders) 0 Shipping Mode 0 dtype: int64
df.drop(columns=['Customer Email', 'Product Image', 'Customer Password', 'Order Zipcode', 'Product Description'], inplace=True)
df.drop(columns=['Order Profit Per Order'], inplace=True)
df.isnull().sum()
Type 0 Days for shipping (real) 0 Days for shipment (scheduled) 0 Benefit per order 0 Sales per customer 0 Delivery Status 0 Late_delivery_risk 0 Category Id 0 Category Name 0 Customer City 0 Customer Country 0 Customer Fname 0 Customer Id 0 Customer Lname 8 Customer Segment 0 Customer State 0 Customer Street 0 Customer Zipcode 3 Department Id 0 Department Name 0 Latitude 0 Longitude 0 Market 0 Order City 0 Order Country 0 Order Customer Id 0 order date (DateOrders) 0 Order Id 0 Order Item Cardprod Id 0 Order Item Discount 0 Order Item Discount Rate 0 Order Item Id 0 Order Item Product Price 0 Order Item Profit Ratio 0 Order Item Quantity 0 Sales 0 Order Item Total 0 Order Region 0 Order State 0 Order Status 0 Product Card Id 0 Product Category Id 0 Product Name 0 Product Price 0 Product Status 0 shipping date (DateOrders) 0 Shipping Mode 0 dtype: int64
df['Customer Zipcode'].fillna('Unknown',inplace=True)
df['Customer Lname'].fillna('Unknown',inplace=True)
df.isnull().sum()
Type 0 Days for shipping (real) 0 Days for shipment (scheduled) 0 Benefit per order 0 Sales per customer 0 Delivery Status 0 Late_delivery_risk 0 Category Id 0 Category Name 0 Customer City 0 Customer Country 0 Customer Fname 0 Customer Id 0 Customer Lname 0 Customer Segment 0 Customer State 0 Customer Street 0 Customer Zipcode 0 Department Id 0 Department Name 0 Latitude 0 Longitude 0 Market 0 Order City 0 Order Country 0 Order Customer Id 0 order date (DateOrders) 0 Order Id 0 Order Item Cardprod Id 0 Order Item Discount 0 Order Item Discount Rate 0 Order Item Id 0 Order Item Product Price 0 Order Item Profit Ratio 0 Order Item Quantity 0 Sales 0 Order Item Total 0 Order Region 0 Order State 0 Order Status 0 Product Card Id 0 Product Category Id 0 Product Name 0 Product Price 0 Product Status 0 shipping date (DateOrders) 0 Shipping Mode 0 dtype: int64
date_columns = ['order date (DateOrders)', 'shipping date (DateOrders)']
for col in date_columns:
df[col] = pd.to_datetime(df[col])
print(df.dtypes)
Type object Days for shipping (real) int64 Days for shipment (scheduled) int64 Benefit per order float64 Sales per customer float64 Delivery Status object Late_delivery_risk int64 Category Id int64 Category Name object Customer City object Customer Country object Customer Fname object Customer Id int64 Customer Lname object Customer Segment object Customer State object Customer Street object Customer Zipcode object Department Id int64 Department Name object Latitude float64 Longitude float64 Market object Order City object Order Country object Order Customer Id int64 order date (DateOrders) datetime64[ns] Order Id int64 Order Item Cardprod Id int64 Order Item Discount float64 Order Item Discount Rate float64 Order Item Id int64 Order Item Product Price float64 Order Item Profit Ratio float64 Order Item Quantity int64 Sales float64 Order Item Total float64 Order Region object Order State object Order Status object Product Card Id int64 Product Category Id int64 Product Name object Product Price float64 Product Status int64 shipping date (DateOrders) datetime64[ns] Shipping Mode object dtype: object
# Ensuring it is in dateTime format and extracting "Order Year" column
df['order date (DateOrders)'] = pd.to_datetime(df['order date (DateOrders)'])
df['Order Year'] = df['order date (DateOrders)'].dt.year
df['year_from_date'] = df['shipping date (DateOrders)'].dt.year.astype(int)
type_counts = df['Type'].value_counts().reset_index()
type_counts.columns = ['Payment Type', 'Count']
fig = px.bar(type_counts, x='Payment Type', y='Count',
title='Count of Orders by Payment Type',
color='Count',
color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(
xaxis_title="Payment Type",
yaxis_title="Number of Orders",
coloraxis_showscale=False
)
fig.show()
import pandas as pd
import plotly.express as px
# Counting the occurrences of each delivery status for the pie chart
delivery_status_counts = df['Delivery Status'].value_counts().reset_index()
delivery_status_counts.columns = ['Delivery Status', 'Count']
pie_chart_fig = px.pie(delivery_status_counts, values='Count', names='Delivery Status',
title='Pie Chart for Delivery Status', color_discrete_sequence=px.colors.sequential.RdBu)
pie_chart_fig.show()
df['Delivery Status'] = df['Late_delivery_risk'].apply(lambda x: 'On Time' if x == 1 else 'Late')
delivery_counts = df.groupby(['Shipping Mode', 'Delivery Status']).size().reset_index(name='Count')
fig = px.bar(delivery_counts, x='Shipping Mode', y='Count', color='Delivery Status',
barmode='group', title='Total Deliveries by Shipping Mode and Delivery Status')
fig.update_layout(xaxis={'categoryorder':'total descending'},
yaxis_title='Total Deliveries',
xaxis_title='Shipping Mode')
fig.show()
segment_yearly_sales = df.groupby(['Order Year', 'Customer Segment'])['Sales'].sum().reset_index()
fig = px.bar(segment_yearly_sales, x='Order Year', y='Sales', color='Customer Segment',
title='Total Sales by Customer Segment Over the Years')
fig.update_layout(
xaxis_title="Order Year",
yaxis_title="Total Sales",
barmode='group'
)
fig.show()
fig = px.scatter_mapbox(df,
lat='Latitude',
lon='Longitude',
hover_name='Category Name',
zoom=3,
height=600,
title='Geographic Distribution of Orders')
fig.update_layout(mapbox_style='open-street-map')
fig.show()
country_yearly_sales = df.groupby(['Order Country', 'year_from_date'])['Sales'].sum().reset_index()
# function to convert country names to ISO Alpha-3 codes
def convert_to_iso_alpha(country_name):
try:
return pycountry.countries.lookup(country_name).alpha_3
except LookupError:
return None
# Applying the function
country_yearly_sales['iso_alpha'] = country_yearly_sales['Order Country'].apply(convert_to_iso_alpha)
# Filtering out any rows that did not get an ISO code
country_yearly_sales = country_yearly_sales.dropna(subset=['iso_alpha'])
fig = px.choropleth(country_yearly_sales,
locations='iso_alpha',
color='Sales',
hover_name='Order Country',
animation_frame='year_from_date',
color_continuous_scale=px.colors.sequential.Plasma,
title='Yearly Sales by Country')
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 3000
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 500
fig.show()
order_status_counts = df['Order Status'].value_counts().reset_index()
order_status_counts.columns = ['Order Status', 'Counts']
colors = ['blue', 'green', 'red', 'purple', 'orange', 'yellow']
bar_chart_fig = px.bar(order_status_counts, x='Order Status', y='Counts',
title='Counts of Different Order Statuses',
color='Order Status',
color_discrete_sequence=colors)
bar_chart_fig.show()
category_sales = df.groupby('Category Name')['Sales'].sum().nlargest(15)
category_sales = category_sales.reset_index()
fig = px.treemap(category_sales, path=['Category Name'], values='Sales',
hover_data={'Category Name': True, 'Sales': ':.2f'},
title='Top 15 Category Sales')
fig.show()
df['month_name'] = df['shipping date (DateOrders)'].dt.strftime('%B')
df_long = df.groupby(['year_from_date', 'Delivery Status']).size().reset_index(name='Count')
fig = px.bar(df_long, x='Delivery Status', y='Count',
color='Delivery Status',
animation_frame='year_from_date',
animation_group='Delivery Status',
range_y=[0, df_long['Count'].max()*1.1],
title='Delivery Status by Year')
fig.update_layout(xaxis={'categoryorder':'total descending'},
yaxis_title='Count',
xaxis_title='Delivery Status')
fig.show()
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=df, x='Market', palette='Set2')
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
for p in ax.patches:
ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', fontsize=11, color='black', xytext=(0, 5),
textcoords='offset points')
plt.title('Count of Markets', fontsize=16)
plt.show()
df['Delivery Status'] = df['Late_delivery_risk'].map({1: 'Late', 0: 'On Time'})
delivery_counts = df.groupby(['Shipping Mode', 'Delivery Status']).size().reset_index(name='Count')
plt.figure(figsize=(10, 6))
sns.barplot(data=delivery_counts, x='Shipping Mode', y='Count', hue='Delivery Status', palette='viridis')
plt.title('Total Deliveries by Shipping Mode and Delivery Status')
plt.xlabel('Shipping Mode')
plt.ylabel('Total Deliveries')
for p in plt.gca().patches:
plt.gca().annotate(format(p.get_height(), '.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center',
va = 'center',
xytext = (0, 10),
textcoords = 'offset points')
plt.show()